Problem Set #2

library(tidyverse)
library(EmpiricalIO2020)
set.seed(1)

1. Objective function

  • Let \[ X_{ij}\beta \equiv \beta_{0}+\beta_{1}\cdot\mathrm{age}_{i}\cdot\mathrm{hp}_{j}+\beta_{2}\cdot\text{ gender }_{i}\cdot\mathrm{fe}_{j}. \]
  • Then, the likelihood function is \[ L=\prod_{i=1}^{N}\prod_{j=1}^{J}\left\{ \frac{\exp(X_{ij}\beta)}{\sum_{k=1}^{J}\exp(X_{ik}\beta)}\right\} ^{y_{ij}} \]
    • \(y_{ij}\) is a dummy variable indicating that \(i\) chooses \(j\).
  • Taking the log of it, we have \[\begin{align*} \log L & =\sum_{i=1}^{N}\sum_{j=1}^{J}y_{ij}\log\left\{ \frac{\exp(X_{ij}\beta)}{\sum_{k=1}^{J}\exp(X_{ik}\beta)}\right\} \end{align*}\]
    • This is the objective function we want to maximize.

2. Estimation

  • Import the data and name the columns.
    • I add an id column, which identifies each individual.
# Load the raw choice data; columns are unnamed in the file, so name them here.
individual <- read_csv(
  "../input/data/Data2020GIOPS2.csv",
  col_names = c("choice", "age", "gender")
)
Parsed with column specification:
cols(
  choice = col_double(),
  age = col_double(),
  gender = col_double()
)
# Attach a sequential individual identifier and put it first.
# row_number() is the idiomatic dplyr replacement for 1:NROW(individual):
# it refers to the data flowing through the pipe and is safe on zero rows.
individual <- individual %>%
  mutate(id = row_number()) %>%
  select(id, everything())    # order the columns

# Number of individuals in the sample.
N <- NROW(individual)

individual
  • Next, I compute the mean utility for each alternative.
  • To do so, I make a data frame X that contains product characteristics, where rows correspond to alternatives and columns to dimensions of characteristics.
    • I add the outside option row.
# Product characteristics: one row per alternative j.
# j = 0 is the outside option, whose characteristics are all zero
# (its utility is normalized to zero in compute_mean_utility below).
# x_1 and x_2 are the two product attributes interacted with age and
# gender in the utility specification (presumably hp and fe from the
# model statement above — confirm against the assignment handout).
X <- tibble(
  j = 0:3,
  x_1 = c(0, 1, 1.2, 1.4),
  x_2 = c(0, 5, 3.5, 2)
)
X
  • Using this, I merge X and individual.
    • I make eps variable, which is \(\varepsilon_{ij}\) following Gumbel distribution.
    • The variable y is the same as the lecture note: \(y_{ij}=1\) if \(i\) chooses \(j\)
  • HW2_df_join can be found here
# One row per (individual, alternative) pair; id varies slowest so the
# ordering matches rep(1:N, each = 4) / rep(0:3, times = N).
match <- expand_grid(id = seq_len(N), j = 0:3)
df <- HW2_df_join(individual, X, match) %>%
  mutate(eps = evd::rgumbel(n()))    # one Gumbel draw per row
df
  • Then, I write a function to compute the mean utility, delta, and the random utility, u.
# Compute the mean utility delta and the random utility u = delta + eps
# for every (individual, alternative) row of df.
#
# beta: numeric vector (beta_0, beta_1, beta_2) of utility parameters.
# df:   data frame with columns j, age, gender, x_1, x_2, eps.
#
# The outside option (j == 0) is normalized to zero utility.
# Returns df with columns delta and u appended.
compute_mean_utility <- function(beta, df){
  df %>%
    mutate(
      delta = if_else(
        j == 0, 0,
        beta[1] + beta[2] * age * x_1 + beta[3] * gender * x_2
      ),
      u = if_else(j == 0, 0, delta + eps)
    )
}
compute_mean_utility(c(1,2,3), df)
  • Then, I write a function to compute the choice probabilities.
# Compute the logit choice probability of each alternative for each
# individual: prob_choice = exp(delta_ij) / sum_k exp(delta_ik).
#
# beta: numeric vector of utility parameters, passed through to
#       compute_mean_utility.
# df:   individual-alternative data frame (see compute_mean_utility).
#
# Returns df with exp_delta, sum_exp_delta, and prob_choice appended.
compute_choice_prob <- function(beta, df){
  compute_mean_utility(beta, df) %>%
    mutate(exp_delta = exp(delta)) %>%      # numerator of the logit share
    group_by(id) %>%                        # denominator sums within individual
    mutate(sum_exp_delta = sum(exp_delta),
           prob_choice = exp_delta / sum_exp_delta) %>%
    ungroup()
}
compute_choice_prob(c(1,2,3), df)
  • Finally, I compute the (log-) likelihood.
# Log-likelihood of the multinomial logit model:
#   log L = sum_i sum_j y_ij * log(prob_choice_ij).
# Only the chosen alternatives (y == 1) contribute to the sum.
#
# beta: numeric vector of utility parameters.
# df:   individual-alternative data frame with a y indicator column.
# Returns a scalar log-likelihood value.
likelihood <- function(beta, df){
  probs <- compute_choice_prob(beta, df)
  sum(probs$y * log(probs$prob_choice))
}
likelihood(c(1,2,3), df)
[1] -171477.5
  • With the log-likelihood function at hand, we can optimize it.
# Maximize the log-likelihood starting from an arbitrary initial guess.
beta_init <- c(-1, 1, 0.5)
optim_result <- optim(
  par = beta_init,
  fn = likelihood,
  df = df,
  method = "Nelder-Mead",
  control = list(fnscale = -1)    # fnscale = -1 makes optim maximize
)
optim_result
$par
[1] -1.88642927  0.09716284 -1.02709908

$value
[1] -8361.77

$counts
function gradient 
     174       NA 

$convergence
[1] 0

$message
NULL

3. Standard Errors

  • I calculate the standard errors using bootstrap.
# Bootstrap standard errors: resample individuals with replacement S times,
# re-estimate beta on each resample, and take the sd of the estimates.
S <- 1000
result <- array(dim = c(3, S))          # column s holds the s-th bootstrap estimate
beta_init <- c(-2, 0.1, -1)             # start from (near) the point estimate
eps <- evd::rgumbel(NROW(match))    # random shocks should be fixed outside the simulation loop
for (i in seq_len(S)) {
  # Resample N individual ids with replacement; duplicates are intended.
  random_id <- sample(N, N, replace = TRUE)
  match_bs <- tibble(
    id = rep(random_id, each = 4),
    j = rep(0:3, times = N),
    eps = eps    # eps only feeds the u column; prob_choice depends on delta alone
  )
  df_bs <- HW2_df_join(individual, X, match_bs)
  # BUG FIX: the original call omitted control = list(fnscale = -1), so optim
  # *minimized* the log-likelihood in every bootstrap replication even though
  # the point estimate above *maximized* it. Maximize here as well.
  result[, i] <- optim(beta_init, likelihood, df = df_bs,
                       control = list(fnscale = -1),
                       method = "Nelder-Mead")$par
}
# Standard error of each coefficient = sd across bootstrap replications.
se_1 <- sd(result[1, ])
se_2 <- sd(result[2, ])
se_3 <- sd(result[3, ])

# Report the bootstrapped standard errors, rounded to 3 decimal places.
paste("standard error for beta_0 is ", round(se_1, 3))
[1] "standard error for beta_0 is  0.414"
paste("standard error for beta_1 is ", round(se_2, 3))
[1] "standard error for beta_1 is  0.046"
paste("standard error for beta_2 is ", round(se_3, 3))
[1] "standard error for beta_2 is  1.734"